library(car)
library(mosaic)
library(DT)
library(skimr)
library(tidyverse)
Made by Jim and Brent. Let’s take a look.
# Locate and load the data set from the current working directory.
# `files` keeps the full directory listing.
files <- dir()
# Match the file name literally (fixed = TRUE): read as a regular expression,
# the "." would match any character and could pick up unrelated files.
data_file <- grep("RBdata.csv", files, value = TRUE, fixed = TRUE)
# read.csv() needs exactly one path; fail early with a clear message otherwise.
if (length(data_file) != 1L) {
  stop(
    "Expected exactly one file matching 'RBdata.csv' in ", getwd(),
    " but found ", length(data_file), ".",
    call. = FALSE
  )
}
rdat <- read.csv(data_file, header = TRUE)
# Render the raw data as a DT table for browsing.
datatable(rdat)
# Grey ramp with one shade per row. as.factor(Y) orders its levels by value,
# so small Y plots dark and large Y plots light.
# The palette is set once; it stays in effect for both plots below.
palette(gray(seq(0, 0.9, length.out = nrow(rdat))))

# Scatterplot matrix of the raw data with a smoother in every panel.
pairs(
  rdat,
  panel = panel.smooth,
  pch = 16,
  cex = 1.2,
  col = as.factor(rdat$Y),
  col.smooth = "skyblue4"
)

# Same matrix with log(Y) prepended as an extra column.
pairs(
  cbind(logY = log(rdat$Y), rdat),
  panel = panel.smooth,
  pch = 16,
  cex = 1.2,
  col = as.factor(rdat$Y),
  col.smooth = "skyblue4"
)
# Baseline: simple linear regression of the untransformed response on X1.
lm1 <- lm(
  formula = Y ~ X1,
  data = rdat
)
summary(lm1)
##
## Call:
## lm(formula = Y ~ X1, data = rdat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -125.5 -122.1 -78.0 -64.5 6110.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78.75 45.44 1.733 0.0841 .
## X1 46.71 65.58 0.712 0.4769
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 567.5 on 298 degrees of freedom
## Multiple R-squared: 0.001699, Adjusted R-squared: -0.001651
## F-statistic: 0.5073 on 1 and 298 DF, p-value: 0.4769
# Box-Cox profile for the baseline fit over lambda in [-0.1, 0.05], step 0.01.
boxCox(lm1, lambda = seq(-0.1, 0.05, by = 0.01))

# Refit with the response raised to the power -0.04; this overwrites lm1.
lm1 <- lm(
  formula = Y^-0.04 ~ X1,
  data = rdat
)
summary(lm1)
##
## Call:
## lm(formula = Y^-0.04 ~ X1, data = rdat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.253368 -0.060524 0.000226 0.061572 0.306126
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.939291 0.007284 128.948 <2e-16 ***
## X1 0.019103 0.010514 1.817 0.0702 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09098 on 298 degrees of freedom
## Multiple R-squared: 0.01096, Adjusted R-squared: 0.007638
## F-statistic: 3.301 on 1 and 298 DF, p-value: 0.07023
# Store the transformed response (same power as the refitted lm1) as a column.
rdat$Yt <- rdat$Y^-0.04
# Move Yt to the front by name rather than by hard-coded position, so this
# works for any number of columns and is safe to re-run.
rdat <- rdat[, c("Yt", setdiff(names(rdat), "Yt"))]

palette(c("skyblue", "orange", "green"))
# Residuals and fitted values of lm1 against every column of rdat.
# Full component names are used to avoid relying on partial `$` matching.
# NOTE(review): X3 is numeric with many NAs, so as.factor(X3) has far more
# than three levels and the three palette colours are reused; confirm X3 is
# the intended colour variable.
pairs(
  cbind(R = lm1$residuals, fit = lm1$fitted.values, rdat),
  pch = 16,
  cex = 1.2,
  panel = panel.smooth,
  col.smooth = "skyblue4",
  col = as.factor(rdat$X3)
)
X3 should be logged…
# Per-column summary of rdat (type, missing count, quantiles, histogram) via skimr.
skim(rdat)
## Skim summary statistics
## n obs: 300
## n variables: 12
##
## ── Variable type:factor ──────────────────────────────────────────────
## variable missing complete n n_unique top_counts
## X7 0 300 300 288 (-<: 2, Ali: 2, Ame: 2, Bob: 2
## ordered
## FALSE
##
## ── Variable type:integer ─────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75 p100 hist
## X1 0 300 300 0.48 0.5 0 0 0 1 1 ▇▁▁▁▁▁▁▇
##
## ── Variable type:numeric ─────────────────────────────────────────────
## variable missing complete n mean sd p0 p25
## X10 0 300 300 2.06 1.14 0.00081 1.07
## X2 0 300 300 -3.02 2.1 -21.87 -3.98
## X3 129 171 300 2.1e+26 2e+27 3e-05 2377.6
## X4 0 300 300 -2.39 1.26 -5.66 -3.42
## X5 0 300 300 3.99 2.98 -2.26 1.4
## X6 0 300 300 2.11 1.15 0.011 1.2
## X8 0 300 300 4.24 3.96 -3.83 1
## X9 0 300 300 -8.32 6.48 -32.63 -12.18
## Y 0 300 300 101.18 567.06 0.0028 0.7
## Yt 0 300 300 0.95 0.091 0.71 0.88
## p50 p75 p100 hist
## 2.08 3.04 3.97 ▆▆▆▆▇▇▆▇
## -3.09 -2.35 10.95 ▁▁▁▁▇▁▁▁
## 2.1e+10 3.7e+15 2.1e+28 ▇▁▁▁▁▁▁▁
## -2.3 -1.4 1.48 ▁▅▆▇▇▆▁▁
## 4.04 6.46 10.84 ▂▇▇▇▇▇▆▁
## 2.27 3.04 3.99 ▆▆▆▆▇▇▇▇
## 1 9 9 ▁▁▁▇▁▁▁▆
## -6.86 -3.64 8.81 ▁▁▂▃▆▇▃▁
## 3.7 22.35 6235.63 ▇▁▁▁▁▁▁▁
## 0.95 1.01 1.26 ▁▂▇▇▇▂▁▁
X7 seems especially odd… Let’s split it at the " - "; maybe the numbers are useful.
# Count each distinct X7 label to see how the "<name> - <number>" values are formed.
table(rdat$X7)
##
## (-<U+203F><U+203F>-) - 124 (-<U+203F><U+203F>-) - 16
## 1 1
## (-<U+203F><U+203F>-) - 174 (-<U+203F><U+203F>-) - 18
## 1 1
## (-<U+203F><U+203F>-) - 2 (-<U+203F><U+203F>-) - 31
## 1 2
## (-<U+203F><U+203F>-) - 32 (-<U+203F><U+203F>-) - 41
## 1 1
## (-<U+203F><U+203F>-) - 42 (-<U+203F><U+203F>-) - 6
## 1 1
## (-<U+203F><U+203F>-) - 62 (-<U+203F><U+203F>-) - 63
## 1 1
## (-<U+203F><U+203F>-) - 7 Alice - 0
## 1 2
## Alice - 1 Alice - 10
## 1 1
## Alice - 112 Alice - 118
## 1 1
## Alice - 136 Alice - 189
## 1 1
## Alice - 199 Alice - 3
## 1 1
## Alice - 455 Alice - 79
## 1 1
## Amelia - 103 Amelia - 134
## 1 1
## Amelia - 167 Amelia - 192
## 1 1
## Amelia - 2 Amelia - 273
## 1 1
## Amelia - 29 Amelia - 33
## 1 1
## Amelia - 40 Amelia - 43
## 1 2
## Amelia - 47 Amelia - 5
## 1 1
## Amelia - 69 Big Max - 0
## 1 1
## Big Max - 1 Big Max - 10
## 1 1
## Big Max - 1065 Big Max - 159
## 1 1
## Big Max - 230 Big Max - 26
## 1 1
## Big Max - 28 Big Max - 30
## 1 1
## Big Max - 313 Big Max - 371
## 1 1
## Big Max - 434 Big Max - 86
## 1 1
## Big Max - 88 Bobo - 1
## 1 1
## Bobo - 132 Bobo - 162
## 1 1
## Bobo - 2 Bobo - 206
## 2 1
## Bobo - 28 Bobo - 31
## 1 1
## Bobo - 45 Bobo - 87
## 1 1
## Bobo - 92 Cory - 0
## 1 1
## Cory - 1 Cory - 150
## 2 1
## Cory - 2 Cory - 314
## 1 1
## Cory - 38 Cory - 488
## 1 1
## Cory - 5 Cory - 68
## 1 1
## Cory - 71 Cory - 74
## 1 1
## Cory - 9 Curious George - 0
## 1 1
## Curious George - 106 Curious George - 137
## 1 1
## Curious George - 167 Curious George - 20
## 1 1
## Curious George - 252 Curious George - 288
## 1 1
## Curious George - 30 Curious George - 304
## 1 1
## Curious George - 389 Curious George - 5
## 1 1
## Curious George - 6 Curious George - 7
## 1 1
## Curious George - 85 Emily - 0
## 1 1
## Emily - 133 Emily - 147
## 1 1
## Emily - 189 Emily - 256
## 1 1
## Emily - 27 Emily - 274
## 1 1
## Emily - 318 Emily - 384
## 1 1
## Emily - 43 Emily - 46
## 1 1
## Emily - 8 Ferris - 0
## 1 1
## Ferris - 1 Ferris - 10
## 2 1
## Ferris - 125 Ferris - 2
## 1 1
## Ferris - 281 Ferris - 29
## 1 1
## Ferris - 35 Ferris - 46
## 1 1
## Ferris - 5 Ferris - 7
## 1 1
## Ferris - 73 Ferris - 74
## 1 1
## Ferris - 80 Ferris - 86
## 1 1
## Francine - 0 Francine - 144
## 1 1
## Francine - 17 Francine - 176
## 1 1
## Francine - 187 Francine - 25
## 1 1
## Francine - 3 Francine - 335
## 1 1
## Francine - 36 Francine - 5
## 1 1
## Francine - 56 Francine - 60
## 1 1
## Francine - 9 Henry - 14
## 1 1
## Henry - 16 Henry - 22
## 1 1
## Henry - 27 Henry - 34
## 1 1
## Henry - 36 Henry - 4
## 1 1
## Henry - 469 Jacqueline - 154
## 1 1
## Jacqueline - 203 Jacqueline - 21
## 1 1
## Jacqueline - 216 Jacqueline - 30
## 1 1
## Jacqueline - 47 Jacqueline - 66
## 1 2
## Jacqueline - 7 Jacqueline - 86
## 1 1
## Jacqueline - 90 Jessie - 137
## 1 1
## Jessie - 16 Jessie - 219
## 1 1
## Jessie - 22 Jessie - 256
## 1 1
## Jessie - 315 Jessie - 32
## 1 1
## Jessie - 70 Jessie - 79
## 1 1
## John - 10 John - 17
## 1 1
## John - 208 John - 216
## 1 1
## John - 22 John - 275
## 1 1
## John - 30 John - 323
## 1 1
## John - 368 John - 385
## 1 1
## John - 46 John - 66
## 1 1
## John - 85 John - 9
## 1 1
## Johnny James Yogurt Jr - 1 Johnny James Yogurt Jr - 140
## 1 1
## Johnny James Yogurt Jr - 157 Johnny James Yogurt Jr - 2
## 1 1
## Johnny James Yogurt Jr - 29 Johnny James Yogurt Jr - 345
## 1 1
## Johnny James Yogurt Jr - 594 Johnny James Yogurt Jr - 77
## 1 1
## Johnny James Yogurt Jr - 78 Johnny James Yogurt Jr - 8
## 1 1
## Johnny James Yogurt Jr - 90 Kystystal - 0
## 1 1
## Kystystal - 105 Kystystal - 11
## 1 1
## Kystystal - 135 Kystystal - 197
## 1 1
## Kystystal - 2 Kystystal - 227
## 1 1
## Kystystal - 27 Kystystal - 40
## 1 1
## Kystystal - 54 Kystystal - 55
## 1 1
## Kystystal - 57 Kystystal - 625
## 1 1
## Kystystal - 74 Kystystal - 9
## 1 1
## Marcel the Shell - 0 Marcel the Shell - 1
## 2 1
## Marcel the Shell - 135 Marcel the Shell - 147
## 1 1
## Marcel the Shell - 2 Marcel the Shell - 211
## 1 1
## Marcel the Shell - 27 Marcel the Shell - 29
## 1 2
## Marcel the Shell - 3 Marcel the Shell - 33
## 1 1
## Marcel the Shell - 449 Marcel the Shell - 48
## 1 1
## Marcel the Shell - 5 Marcel the Shell - 51
## 1 1
## Marcel the Shell - 69 Moose - 0
## 1 1
## Moose - 1 Moose - 16
## 1 1
## Moose - 187 Moose - 289
## 1 1
## Moose - 299 Moose - 313
## 1 1
## Moose - 397 Moose - 581
## 1 1
## Moose - 86 Olivia - 105
## 1 1
## Olivia - 139 Olivia - 16
## 1 1
## Olivia - 168 Olivia - 210
## 1 1
## Olivia - 29 Olivia - 36
## 1 1
## Olivia - 388 Olivia - 44
## 1 1
## Olivia - 52 Olivia - 662
## 1 1
## Olivia - 68 Olivia - 71
## 1 1
## Olivia - 870 Orion - 0
## 1 1
## Orion - 1 Orion - 115
## 1 1
## Orion - 117 Orion - 145
## 1 1
## Orion - 148 Orion - 16
## 1 1
## Orion - 2 Orion - 250
## 1 1
## Orion - 28 Orion - 32
## 1 1
## Orion - 328 Orion - 5
## 1 1
## Orion - 94 S C Kennedy - 135
## 1 1
## S C Kennedy - 152 S C Kennedy - 2
## 1 1
## S C Kennedy - 35 S C Kennedy - 36
## 1 1
## S C Kennedy - 40 S C Kennedy - 5
## 1 1
## S C Kennedy - 72 S C Kennedy - 78
## 1 1
## Sarah - 102 Sarah - 15
## 1 1
## Sarah - 18 Sarah - 186
## 1 1
## Sarah - 30 Sarah - 300
## 1 1
## Sarah - 32 Sarah - 377
## 1 1
## Sarah - 459 Sarah - 47
## 1 1
## Sarah - 660 Thor - 0
## 1 1
## Thor - 144 Thor - 189
## 1 1
## Thor - 26 Thor - 3
## 1 2
## Thor - 36 Thor - 5
## 1 1
## Thor - 56 Thor - 8
## 1 1
## Thor - 9 Thor - 92
## 1 1
## Zach - 10 Zach - 135
## 1 1
## Zach - 151 Zach - 19
## 1 2
## Zach - 32 Zach - 39
## 1 1
## Zach - 4 Zach - 485
## 2 1
## Zach - 6 Zach - 71
## 1 1
# Split X7 ("<name> - <number>") into a name part (X7.1) and a number part (X7.2).
rdat <- separate(rdat, col = X7, into = c("X7.1", "X7.2"), sep = " - ")
rdat <- as.data.frame(rdat)

# Cube root of X2. X2 is mostly negative, and a negative base raised to a
# fractional power is NaN in R, so take the real cube root via sign/abs.
rdat$X2.1 <- sign(rdat$X2) * abs(rdat$X2)^(1 / 3)
# Log of X3, shifted by 1; rows with missing X3 stay NA.
rdat$X3.1 <- log(rdat$X3 + 1)

# separate() returns both pieces as text: make the name a factor and the
# trailing number numeric.
rdat$X7.1 <- as.factor(rdat$X7.1)
rdat$X7.2 <- as.numeric(rdat$X7.2)

glimpse(rdat)
## Observations: 300
## Variables: 15
## $ Yt <dbl> 0.8758811, 0.8814728, 0.9824715, 1.1595656, 1.0238355, 0....
## $ Y <dbl> 27.47078738, 23.43028592, 1.55596955, 0.02469544, 0.55493...
## $ X1 <int> 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, ...
## $ X2 <dbl> -3.0668674, -4.2638940, -2.4616331, -2.8691551, -2.199544...
## $ X3 <dbl> NA, 1.209923e+08, 9.137040e+07, 3.648866e+03, NA, 1.04338...
## $ X4 <dbl> -1.0158286, -1.2215244, -2.7545422, -4.3892816, -2.143609...
## $ X5 <dbl> 5.8467221, 3.4308835, 2.8617592, 1.4579335, 1.9219055, 1....
## $ X6 <dbl> 3.62008230, 3.62740321, 0.76221983, 0.16364930, 2.3162938...
## $ X7.1 <fct> Olivia, Cory, S C Kennedy, Big Max, Sarah, Alice, Marcel ...
## $ X7.2 <dbl> 36, 71, 2, 10, 377, 199, 51, 44, 10, 47, 90, 36, 31, 10, ...
## $ X8 <dbl> 1, 1, 9, 9, 9, 9, 1, 1, 9, 1, 1, 9, 9, 1, 1, 1, 1, 1, 1, ...
## $ X9 <dbl> -6.010369, -8.412774, 1.574063, -3.233196, -19.415281, -1...
## $ X10 <dbl> 1.71989128, 2.91232062, 2.79434218, 1.21254593, 3.3632002...
## $ X2.1 <dbl> NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, NaN, Na...
## $ X3.1 <dbl> NA, 18.61123770, 18.33043215, 8.20244570, NA, 9.25290273,...
# Grey ramp with one shade per row, indexed by the factor codes of Yt.
palette(gray(seq(0, 0.9, length.out = nrow(rdat))))

# Residuals and fitted values of lm1 against the extended set of columns.
pairs(
  cbind(R = lm1$residuals, fit = lm1$fitted.values, rdat),
  panel = panel.smooth,
  pch = 16,
  cex = 1.2,
  col = as.factor(rdat$Yt),
  col.smooth = "skyblue4"
)

# Transformed response against log(X3); rows with missing X3 are dropped by lm().
lm2 <- lm(
  formula = Yt ~ log(X3),
  data = rdat
)
summary(lm2)
##
## Call:
## lm(formula = Yt ~ log(X3), data = rdat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.24416 -0.06446 0.00145 0.06415 0.30774
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.9340958 0.0119621 78.088 <2e-16 ***
## log(X3) 0.0005542 0.0004038 1.373 0.172
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09513 on 169 degrees of freedom
## (129 observations deleted due to missingness)
## Multiple R-squared: 0.01103, Adjusted R-squared: 0.005176
## F-statistic: 1.884 on 1 and 169 DF, p-value: 0.1717
# Transformed response against X7.2, X9 and their interaction.
lm3 <- lm(
  formula = Yt ~ X7.2 * X9,
  data = rdat
)
summary(lm3)
##
## Call:
## lm(formula = Yt ~ X7.2 * X9, data = rdat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.243503 -0.063263 0.000302 0.065959 0.310846
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.387e-01 1.066e-02 88.071 <2e-16 ***
## X7.2 -1.031e-04 2.922e-04 -0.353 0.724
## X9 -2.309e-03 3.064e-03 -0.753 0.452
## X7.2:X9 -1.129e-06 7.847e-06 -0.144 0.886
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09162 on 296 degrees of freedom
## Multiple R-squared: 0.003803, Adjusted R-squared: -0.006294
## F-statistic: 0.3766 on 3 and 296 DF, p-value: 0.7699
# Back to R's default colour palette.
palette("default")

# lm2 was fitted only on rows with an observed X3, so its residuals and fitted
# values line up with that subset of rdat. The colour vector must be subset
# with the same mask; otherwise colours are assigned to the wrong points.
x3_rows <- !is.na(rdat$X3)
pairs(
  cbind(R = lm2$residuals, fit = lm2$fitted.values, rdat[x3_rows, ]),
  pch = 16,
  cex = 1,
  panel = panel.smooth,
  col.smooth = "skyblue4",
  col = as.factor(rdat$X7.1[x3_rows])
)
# Transformed response against squared X10, squared X6 and their interaction.
lm4 <- lm(
  formula = Yt ~ I(X10^2) * I(X6^2),
  data = rdat
)
summary(lm4)
##
## Call:
## lm(formula = Yt ~ I(X10^2) * I(X6^2), data = rdat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.247008 -0.059866 -0.000932 0.064496 0.314375
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.9550130 0.0127488 74.910 <2e-16 ***
## I(X10^2) -0.0001212 0.0017931 -0.068 0.946
## I(X6^2) -0.0020872 0.0016942 -1.232 0.219
## I(X10^2):I(X6^2) 0.0001859 0.0002265 0.821 0.412
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09143 on 296 degrees of freedom
## Multiple R-squared: 0.007887, Adjusted R-squared: -0.002168
## F-statistic: 0.7844 on 3 and 296 DF, p-value: 0.5034
I look forward to seeing how you built your data. You’ve got me stumped. Here is my final model, \(Y_i' = \beta_0 + \epsilon_i\) where \(Y' = Y^{-0.04}\) (the Box-Cox power stored in `Yt`, which is close to, but not the same as, a log transform).
# Final model: intercept only, i.e. the mean of the transformed response.
lm5 <- lm(
  formula = Yt ~ 1,
  data = rdat
)
summary(lm5)
##
## Call:
## lm(formula = Yt ~ 1, data = rdat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.243435 -0.065319 0.000573 0.065887 0.316059
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.948460 0.005273 179.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09133 on 299 degrees of freedom